We start by loading the packages required (use install.packages if needed).
library(dplyr)
library(Seurat)
library(Matrix)
library(ggplot2)
library(sctransform)
library(reticulate)
library(SeuratWrappers)
library(harmony)
library(reshape2)
Based on Inbal Shanier scripts IEGs expression can be elevated in certain cells as a response to the dissection and dissociation of the cells. This can hamper the analysis of the cell types and therefore we will exclude IEGs from the analysis (see this paper for additional information: https://pubmed.ncbi.nlm.nih.gov/29024657/). We have curated a list of zebrafish known IEGs, and we’ll use this function to exclude them:
remove_genes <- function(dgc_mat, gene_list) {
#the function receives a dgc gene matrix and a list of genes and returns a dgc gene matrix without the gene rows. The loaded 10x data is a dgc matrix.
ieg.indices = row.names(dgc_mat) %in% gene_list
dgc_mat.mod = dgc_mat[!ieg.indices,]
return (dgc_mat.mod)
}
Let’s load our IEGs curated list:
IEGs<- readLines("~/zfish_RGC_atlas/IEG list.csv")
We will load our data from the 10x matrix files. Then we’ll remove the IEGs, create a seurat object, and add additional metadata about the larvae age. We will do that for each of the scSeq batches.
# load the 10x data:
larva.7dpf.data<- readRDS("~/zfish_RGC_atlas/larva_zFish_FINAL.rds")
# Remove ZfishRGC17 due to wetting sample failure
cells.remove = grep("ZfishRGC17_",colnames(larva.7dpf.data), value=TRUE)
larva.7dpf.data = larva.7dpf.data[, setdiff(colnames(larva.7dpf.data), cells.remove)]
# remove the IEGs using the function and the list described above.
larva.7dpf.data.no.iegs <- remove_genes(larva.7dpf.data, IEGs)
# if needed to create a seurat object. rds files already have a Seurat object
#larva.no.iegs <- CreateSeuratObject(counts = larva.7dpf.data.no.iegs, project = "larvaRGC", min.cells = 25, min.features = 450)
# add metadata about the larvae age
larva.no.iegs<- AddMetaData(object=larva.7dpf.data.no.iegs, metadata = "7dpf", col.name="age")
Get the summary of the generated object. Features is the number of genes, samples is the number of cells.
larva <-larva.no.iegs
larva[["percent.mt"]] <- PercentageFeatureSet(larva, pattern = "^MT-")
larva[["percent.rps"]] <- PercentageFeatureSet(larva, pattern = "^RPS")
larva[["percent.rpl"]] <- PercentageFeatureSet(larva, pattern = "^RPL")
larva[["percent.rp"]] <- larva[["percent.rps"]] + larva[["percent.rpl"]]
# Create Violin Plots of RNA counts, mitochondrial
# percentages, and ribosomal percentages
VlnPlot(larva, features = "nCount_RNA", pt.size = 0.3)
head(larva@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## ZfishRGC18_AAACCTGAGAGGGCTT-1 ZfishRGC18 1440 859 0
## ZfishRGC18_AAACCTGAGTTGTAGA-1 ZfishRGC18 2296 1229 0
## ZfishRGC18_AACACGTCATGCCTTC-1 ZfishRGC18 3399 1501 0
## ZfishRGC18_AAGACCTCAAGAGGCT-1 ZfishRGC18 2254 1090 0
## ZfishRGC18_ACACCAACATGCTAGT-1 ZfishRGC18 1429 878 0
## ZfishRGC18_ACACCGGCACCAGGCT-1 ZfishRGC18 1811 1046 0
## percent.rps percent.rpl percent.rp clusterID
## ZfishRGC18_AAACCTGAGAGGGCTT-1 0 0 0 19
## ZfishRGC18_AAACCTGAGTTGTAGA-1 0 0 0 19
## ZfishRGC18_AACACGTCATGCCTTC-1 0 0 0 19
## ZfishRGC18_AAGACCTCAAGAGGCT-1 0 0 0 19
## ZfishRGC18_ACACCAACATGCTAGT-1 0 0 0 19
## ZfishRGC18_ACACCGGCACCAGGCT-1 0 0 0 19
## dendro_order age
## ZfishRGC18_AAACCTGAGAGGGCTT-1 19 7dpf
## ZfishRGC18_AAACCTGAGTTGTAGA-1 19 7dpf
## ZfishRGC18_AACACGTCATGCCTTC-1 19 7dpf
## ZfishRGC18_AAGACCTCAAGAGGCT-1 19 7dpf
## ZfishRGC18_ACACCAACATGCTAGT-1 19 7dpf
## ZfishRGC18_ACACCGGCACCAGGCT-1 19 7dpf
Repeat for the adult data
# load the 10x data:
adult.7dpf.data<- readRDS("~/zfish_RGC_atlas/adult_zFish_FINAL.rds")
# Remove ZfishRGC17 due to wetting sample failure
cells.remove = grep("ZfishRGC17_",colnames(adult.7dpf.data), value=TRUE)
adult.7dpf.data = adult.7dpf.data[, setdiff(colnames(adult.7dpf.data), cells.remove)]
# remove the IEGs using the function and the list described above.
adult.7dpf.data.no.iegs <- remove_genes(adult.7dpf.data, IEGs)
# if needed to create a seurat object. rds files already have a Seurat object
#adult.no.iegs <- CreateSeuratObject(counts = adult.7dpf.data.no.iegs, project = "adultRGC", min.cells = 25, min.features = 450)
# add metadata about the adulte age
adult.no.iegs<- AddMetaData(object=adult.7dpf.data.no.iegs, metadata = "7dpf", col.name="age")
adult <-adult.no.iegs
head(adult@meta.data)
## orig.ident batch nCount_RNA nFeature_RNA
## ZfishRGC1_AAACCTGGTCCAGTGC-1 ZfishRGC1 Batch1 1212 765
## ZfishRGC1_ACGGAGAAGAAACGAG-1 ZfishRGC1 Batch1 687 496
## ZfishRGC1_ACTGAGTTCTAACCGA-1 ZfishRGC1 Batch1 772 505
## ZfishRGC1_AGTTGGTCAGCCAGAA-1 ZfishRGC1 Batch1 1332 807
## ZfishRGC1_ATCATCTTCCTATGTT-1 ZfishRGC1 Batch1 1113 709
## ZfishRGC1_ATCGAGTTCACTTCAT-1 ZfishRGC1 Batch1 768 550
## percent.mt percent.rps percent.rpl percent.rp
## ZfishRGC1_AAACCTGGTCCAGTGC-1 6.219313 2.536825 3.518822 6.055646
## ZfishRGC1_ACGGAGAAGAAACGAG-1 4.317549 2.506964 4.596100 7.103064
## ZfishRGC1_ACTGAGTTCTAACCGA-1 7.779172 4.642409 4.642409 9.284818
## ZfishRGC1_AGTTGGTCAGCCAGAA-1 9.356725 2.266082 3.143275 5.409357
## ZfishRGC1_ATCATCTTCCTATGTT-1 11.488251 1.131419 2.001741 3.133159
## ZfishRGC1_ATCGAGTTCACTTCAT-1 4.156171 3.274559 5.163728 8.438287
## clusterID dendro_order age
## ZfishRGC1_AAACCTGGTCCAGTGC-1 13 13 7dpf
## ZfishRGC1_ACGGAGAAGAAACGAG-1 13 13 7dpf
## ZfishRGC1_ACTGAGTTCTAACCGA-1 13 13 7dpf
## ZfishRGC1_AGTTGGTCAGCCAGAA-1 13 13 7dpf
## ZfishRGC1_ATCATCTTCCTATGTT-1 13 13 7dpf
## ZfishRGC1_ATCGAGTTCACTTCAT-1 13 13 7dpf
Lets remove the unneeded objects from the environment to free memory. We will only need the merged seurat object from now on.
rm(larva.7dpf.data,
larva.7dpf.data.no.iegs,
larva.no.iegs,
adult.7dpf.data,
adult.7dpf.data.no.iegs,
adult.no.iegs)
Removal of bad quality cells: 1. Cell with high mitochondrial gene expression which usually sugests these are stressed cells. 2. Cells with unusually high gene or UMI number, which usuaaly suggests there are douplets or triplets (originate from droplets containing more than one cell).
We first calculate mitochondrial QC metrics with the PercentageFeatureSet function, which calculates the percentage of counts originating from a set of features. Mitochndrial genes have the prefix ‘mt-’, which is used to identify them.
larva[["percent.mt"]] <- PercentageFeatureSet(object = larva, pattern = "^mt-")
Lets visualize QC metrics as a violin plot (the features are the genes, the count is the number of UMIs).The visualization is according to the batch, which also help us to observe batch quality differences.
VlnPlot(object = larva, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size=1)
adult[["percent.mt"]] <- PercentageFeatureSet(object = adult, pattern = "^mt-")
VlnPlot(object = adult, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size=1)
We’ll now filter out the cells with unusual number of genes (nFeature_RNA), precent.mt and UMI (nCount_RNA). What is unusual? No ground truth role here. It can vary with the tissue examined, the efficiency the 10x or the sequencing machine. We can examine the violin plot and get a feeling of the outliers in our data. Additionally for mitochondrial genes, the avg. in neurons is about 12.5%, so I maybe the threshold should be higher to capture most of the population variance. The most important part is to check how the downstream analysis looks like, and return and refine the threshold set here. For example, if in the downstream analysis mitochondrial genes are highly present in the variable genes, the bar was set too high. If some cells have marker genes of two different cluster, maybe there are a lot duplets in the data and the threshold for gene number and UMI should be lower.
The subset command is used to keep the cells with certain characteristics:
larva <- subset(x = larva, subset = nFeature_RNA > 200 & nFeature_RNA < 4000 & percent.mt < 20 & nCount_RNA < 6000)
How many cells have past the threshold and will be used for the down stream analysis:
sum(table(...=larva@active.ident))
## [1] 11380
adult <- subset(x = adult, subset = nFeature_RNA > 200 & nFeature_RNA < 4000 & percent.mt < 20 & nCount_RNA < 6000)
sum(table(...=adult@active.ident))
## [1] 30483
We will now normalize and scale the data. Normalize the data by a global-scaling normalization method. LogNormalize: Feature counts for each cell are divided by the total counts for that cell and multiplied by the scale.factor. This is then natural-log transformed using log1p. More normalization methods can be used (CLR and Relative counts). Check the function help for more information.
larva <- NormalizeData(object = larva,
normalization.method = "LogNormalize",
scale.factor = 10000)
adult <- NormalizeData(object = adult,
normalization.method = "LogNormalize",
scale.factor = 10000)
Identification of highly variable features (feature selection). Identifies features that are outliers on a ‘mean variability plot’.The default- 2,000 features per dataset. vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then standardizes the feature values using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).
larva <- FindVariableFeatures(object = larva,
selection.method = "vst",
nfeatures = 2000)
adult <- FindVariableFeatures(object = adult,
selection.method = "vst",
nfeatures = 2000)
Scaling the data: Apply a linear transformation (‘scaling’) that is a standard pre-processing step prior to dimensional reduction. The ScaleData function: Shifts the expression of each gene, so that the mean expression across cells is 0 Scales the expression of each gene, so that the variance across cells is 1 This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate The results of this are stored in larva[[“RNA”]]@scale.data
all.genes.no.iegs.larva <- rownames(x = larva)
larva<- ScaleData(object =larva, features = all.genes.no.iegs.larva)
## Centering and scaling data matrix
all.genes.no.iegs.adult <- rownames(x = adult)
adult<- ScaleData(object =adult, features = all.genes.no.iegs.adult)
## Centering and scaling data matrix
Perform linear dimensional reduction (PCA on scaled data). By default, only the previously determined variable features are used as input, but can be defined using features argument if you wish to choose a different subset.
larva <- RunPCA(object = larva, features = VariableFeatures(object = larva))
## PC_ 1
## Positive: fam60al, si:dkey-151g10.6, rplp2l, stmn2b, rpl3, rpl26, rps12, rpl19, actb2, tubb5
## rplp1, tpt1, cirbpb, rpl37, tmsb, fxyd6l, actb1, eef1a1l1, rps2, rpl7a
## rpl11, rps28, rpl23, ybx1, rps19, rpsa, rpl8, rps15a, rps7, rpl37a
## Negative: aldocb, gapdhs, eno1a, ckbb, higd1a, stmn1b, fam107b, bhlhe41, sncg, sh3gl2
## pkma, scg2b, si:dkey-7j14.5, gpia, tob1a, pgk1, rgs16, nfil3-6, si:dkeyp-72g9.4, si:ch211-195b13.1
## nrgna, dnajb5, ahcyl1, oaz2b, grin1a, slmapb, ywhag1, map4, sybu, ndufa4
## PC_ 2
## Positive: eomesa, elavl4, onecut1, crabp1a, gas7a, syt1a, tspan18a, nmbb, c1qtnf4, si:dkey-152p16.6
## slc16a3, plcxd3, camkvb, kcnd3, pou2f2a, cacng7b, pim3, itm2cb, rdh10a, slc6a1a
## celf4, necab1, plxna4, gap43, p2rx3b, pdlim5b, c1ql3a, si:ch211-222l21.1, cadm4, cabz01063757.1
## Negative: calb2a, calb2b, rbfox1, gnao1b, pou4f1, calm1a, cplx2l, tp53i11a, nrgna, zgc:162707
## gng13b, syt2a, elmo1, zgc:158291, ywhag1, si:dkey-164f24.2, homer3, olfm1b, cabp5b, hmx4
## eno2, grin1a, osbpl10, kcnip3b, rtn4rl2a, scrt1a, rgs8, isl2b, kctd5, nrn1b
## PC_ 3
## Positive: meis2b, pax10, kcnip3b, ebf3a, grin1b, zgc:110340, stmn1b, bcl11ba, nrgna, sulf2b
## pou4f2, sema3b, fam19a1a, prdm8b, phlda3, rprma, bx957297.1, tfap2d, pcdh10b, mamdc2b
## pou4f3, nfil3-6, sox4a, tp53i11b, foxg1b, cr381599.1, satb2, ramp1, tmeff2a, ptprga
## Negative: pcp4a, nell2b, zfhx3, mafaa, mef2cb, tubb2, olfm1b, efhd1, elmo1, atp5g1
## grm6b, isl2a, gria3a, atp5g3b, olfm3a, cox7b, cdh11, slc25a3b, vamp1, ndufa4
## gphnb, pou6f2, usmg5, gng13b, inab, zfhx4, si:dkey-238f9.1, zgc:65851, cox6c, cntn1b
## PC_ 4
## Positive: mef2cb, efhd1, ntrk3b, isl2b, gria3a, gabra2, isl2a, hunk, igfbp5b, olfm3a
## adcyap1b, tbr1b, cdh11, apof, zgc:92066, olfm1b, rtn4rl2a, necab1, epha7, homer3
## wls, sybu, pcdh19, pou6f2, bada, sox11a, mef2aa, glula, pcdh7b, islr2
## Negative: camkva, zgc:92658, tubb2, atp1a3b, zgc:65851, mafaa, edil3, pck1, uo:ion006, atp5g1
## si:ch211-67e16.11, nat8l, rprma, sox6, cacng3b, foxp2, sox11b, meis2b, usmg5, atp5g3b
## rbfox3, tagln3b, zgc:171482, igsf11, slc25a3b, foxp1b, calm1b, adka, cox7b, pcp4l1
## PC_ 5
## Positive: tspan18a, c1ql3a, plcxd3, npb, nmba, meis2b, etv1, barhl2, cfi, ntng1a
## pax10, si:dkeyp-72h1.1, cdh8, dbndd2, inhbaa, rims4, tfap2d, si:ch211-10p21.1, nr4a3, pcdh19
## sncga, galr1a, plce1, pou4f1, slc6a1b, fam46ba, cxcl14, prdm8b, sertad4, kcnip3b
## Negative: eomesa, nmbb, rdh10a, c1qtnf4, camkvb, crabp1a, grapa, isl2b, skor1b, si:dkey-1h6.8
## olfm1b, sox6, kctd4, pnoca, nptx1l, bmp4, ctnnal1, cnih3, necab1, tbx20
## lrrtm1, plxna4, pcp4l1, cygb2, camkva, spon1b, vcam1, adcy2a, ndnfl, pcp4a
adult <- RunPCA(object = adult, features = VariableFeatures(object = adult))
## PC_ 1
## Positive: gap43, tmsb, tubb5, marcksb, prph, vps37d, alcamb, cspg5a, tubb4b, zic3
## sncb, ywhaz, hunk, islr2, cnp, si:ch211-222l21.1, tspan18a, basp1, ptmab, marcksl1a
## atf4b, vat1, pou2f2a, mex3a, casp3a, marcksl1b, ptmaa, b2m, actb2, actb1
## Negative: calb2b, pcdh7b, fam107b, stmn3, gng13b, atp1a3b, rgs8, mef2cb, gria3a, si:dkeyp-72g9.4
## si:ch211-270g19.5, pcp4a, vamp1, adcyap1b, scrt1a, relt, efhd1, zfp36, syt2, eef1a2
## nptx1l, mef2aa, elmo1, rgs16, vstm2b, calb2a, aldocb, ca4a, atp1b3a, grm6b
## PC_ 2
## Positive: tmsb, tubb5, alcamb, vps37d, zfhx3, prph, zic3, cxcr4b, pcdh7b, casp3a
## mex3a, fabp7a, ppp1r14bb, kidins220a, pimr138, insm1a, b2m, alcama, zic2a, fxyd6l
## bcam, cnp, gria3a, vat1, lima1a, si:dkey-238o13.4, pou6f2, adh8b, znf395, tuba8l3
## Negative: hsp70.3, hsp70.2, calb2a, ubb, hsp70.1, marcksl1b, nrgna, dnajb1b, aldocb, nfil3-6
## gapdhs, ube2ql1, grin1b, hspa5, ywhag1, fndc4a, si:ch211-195b13.1, ppp1r1c, kcnip3b, sncga
## arg2, cdk15, prelid3b, cxcl14, tob1b, cbx7a, laptm4b, slc6a17, mknk2b, nocta
## PC_ 3
## Positive: c1ql3a, plcxd3, tspan18a, cfi, aldocb, npb, gapdhs, nell2b, elavl4, si:dkeyp-72h1.1
## etv1, htr1ab, ntng1a, pvalb6, p2rx3b, si:dkey-152p16.6, kcnd3, eno1a, gas7a, necab1
## rims4, zgc:65851, fxyd6, si:dkey-33c12.3, pou6f2, zfhx3, chga, clu, pou2f2a, map1ab
## Negative: kcnip3b, calb2b, zgc:110340, vps37d, calb2a, alcamb, nrgna, cr381599.1, si:dkey-151g10.6, cox4i2
## cartpt, fabp7a, c2cd4a, ppp1r1c, rplp1, ccdc136a, phlda3, meis2b, rplp2l, rps2
## fxyd6l, hmgb2a, rprma, si:dkey-164f24.2, ptmab, tubb5, nfil3-5, cxcr4b, casp3a, rps12
## PC_ 4
## Positive: grin1b, marcksl1b, pou4f2, si:dkey-33c12.3, meis2b, prdm8b, neurod2, tspan18a, sema3b, ntng1a
## cdk15, calb2a, pax10, mamdc2b, sncga, barhl2, cacng5a, cdh8, nrgna, cacng7b
## syt1a, neurod6a, zgc:171482, ccka, barhl1b, inhbaa, rprma, satb2, cxcl14, ebf1a
## Negative: adcyap1b, arg2, si:ch211-195b15.8, hsp70.3, zfp36, ubb, actb1, cbx7a, scg2b, rgs16
## pdcd4b, dnajb1b, syt4, pcdh7b, mef2cb, gng13b, mknk2b, mef2aa, stmn4l, laptm4b
## dnajb5, vim, prelid3b, hsp70.2, si:ch211-270g19.5, rgs8, atp1b3a, olfm1b, atf4b, hspa5
## PC_ 5
## Positive: mt2, sepp1a, mdka, rlbp1a, scg2, glula, apoeb, hsd11b2, pcsk1nl, aqp1a.1
## zfp36l1b, cahz, ppp1r14aa, s100a10b, fabp7a, zgc:165604, her6, tspan18a, si:ch211-152c2.3, atp1a1b
## sparc, phlda3, cr381599.1, icn, aldocb, hepacam, slc1a2b, gipr, sdpra, eif4ebp3l
## Negative: tuba1c, mfap4, si:dkey-1h6.8, tuba1a, pcdh11, olfm1b, isl2b, vim, tubb5, alcamb
## foxp2, rnd3b, zgc:153426, tmsb, fam163a, hsp70.2, prph, foxp1b, hsp70.1, epha7
## gap43, marcksl1b, pbx1a, calb2a, pcp4a, ntrk3b, epb41l4a, zgc:65894, vps37d, arl4ab
I use Harmony for batch correction. In the paper (https://www.nature.com/articles/s41592-019-0619-0) they describe the method as: “an algorithm that projects cells into a shared embedding in which cells group by cell type rather than dataset-specific conditions”. The batches are refered by the “orig.ident”, which we set in the add.cell.ids when generating the merged suerat object.
larva.corrected <- RunHarmony(larva, group.by.vars = "orig.ident")
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony converged after 4 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details
## on syntax validity
adult.corrected <- RunHarmony(adult, group.by.vars = "orig.ident")
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony 8/10
## Harmony converged after 8 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details
## on syntax validity
ElbowPlot(adult.corrected, ndims = 40)
Explanation from the Seurat website: "We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). This step is performed using the FindNeighbors function, and takes as input the previously defined dimensionality of the dataset (first 10 PCs).
To cluster the cells, we next apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function. The FindClusters function implements this procedure, and contains a resolution parameter that sets the ‘granularity’ of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters can be found using the Idents function".
The dims define the number of PCs used. The reduction takes the batch corrected data from Harmony.
larva<- RunUMAP(larva, dims = 1:30)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 16:20:58 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:20:58 Read 11380 rows and found 30 numeric columns
## 16:20:58 Using Annoy for neighbor search, n_neighbors = 30
## 16:20:58 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:21:00 Writing NN index file to temp file /tmp/RtmpvdmR40/filee763916609e0f
## 16:21:00 Searching Annoy index using 1 thread, search_k = 3000
## 16:21:03 Annoy recall = 100%
## 16:21:03 Commencing smooth kNN distance calibration using 1 thread
## 16:21:04 Initializing from normalized Laplacian + noise
## 16:21:04 Commencing optimization for 200 epochs, with 479380 positive edges
## 16:21:08 Optimization finished
larva.corrected<- RunUMAP(larva.corrected, reduction = "harmony", dims = 1:30)
## 16:21:08 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:21:08 Read 11380 rows and found 30 numeric columns
## 16:21:08 Using Annoy for neighbor search, n_neighbors = 30
## 16:21:08 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:21:10 Writing NN index file to temp file /tmp/RtmpvdmR40/filee7639405559c2
## 16:21:10 Searching Annoy index using 1 thread, search_k = 3000
## 16:21:13 Annoy recall = 100%
## 16:21:13 Commencing smooth kNN distance calibration using 1 thread
## 16:21:13 Initializing from normalized Laplacian + noise
## 16:21:14 Commencing optimization for 200 epochs, with 479184 positive edges
## 16:21:18 Optimization finished
adult<- RunUMAP(adult, dims = 1:30)
## 16:21:18 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:21:18 Read 30483 rows and found 30 numeric columns
## 16:21:18 Using Annoy for neighbor search, n_neighbors = 30
## 16:21:18 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:21:22 Writing NN index file to temp file /tmp/RtmpvdmR40/filee763926ce1fe4
## 16:21:22 Searching Annoy index using 1 thread, search_k = 3000
## 16:21:31 Annoy recall = 100%
## 16:21:31 Commencing smooth kNN distance calibration using 1 thread
## 16:21:32 Initializing from normalized Laplacian + noise
## 16:21:34 Commencing optimization for 200 epochs, with 1336038 positive edges
## 16:21:45 Optimization finished
adult.corrected<- RunUMAP(adult.corrected, reduction = "harmony", dims = 1:30)
## 16:21:45 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:21:45 Read 30483 rows and found 30 numeric columns
## 16:21:45 Using Annoy for neighbor search, n_neighbors = 30
## 16:21:45 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:21:49 Writing NN index file to temp file /tmp/RtmpvdmR40/filee763953abb50b
## 16:21:49 Searching Annoy index using 1 thread, search_k = 3000
## 16:21:59 Annoy recall = 100%
## 16:21:59 Commencing smooth kNN distance calibration using 1 thread
## 16:22:00 Initializing from normalized Laplacian + noise
## 16:22:01 Commencing optimization for 200 epochs, with 1377284 positive edges
## 16:22:14 Optimization finished
larva<- FindNeighbors(larva, dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
larva <- FindClusters(object = larva, resolution = 1.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11380
## Number of edges: 435294
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8998
## Number of communities: 36
## Elapsed time: 1 seconds
larva.corrected<- FindNeighbors(larva.corrected, reduction = "harmony", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
larva.corrected <- FindClusters(object = larva.corrected, resolution = 1.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11380
## Number of edges: 436837
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9008
## Number of communities: 36
## Elapsed time: 1 seconds
adult<- FindNeighbors(adult, dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
adult <- FindClusters(object = adult, resolution = 1.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 30483
## Number of edges: 1180451
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9214
## Number of communities: 34
## Elapsed time: 6 seconds
adult.corrected<- FindNeighbors(adult.corrected, reduction = "harmony", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
adult.corrected <- FindClusters(object = adult.corrected, resolution = 1.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 30483
## Number of edges: 1307651
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9261
## Number of communities: 33
## Elapsed time: 6 seconds
Visualize the UMAP. First check the batch correction (defined by using the group.by):
DimPlot(larva, group.by = c("orig.ident"))
DimPlot(larva.corrected, group.by = c("orig.ident"))
DimPlot(adult, group.by = c("orig.ident"))
DimPlot(adult.corrected, group.by = c("orig.ident"))
sample_features = c("eomesa", "tbr1b", "mafaa", "neurod1")
DotPlot(larva.corrected, features = sample_features) + RotatedAxis() + ggtitle("Sample Dot Plot")
sample_features = c("eomesa", "tbr1b", "mafaa", "neurod1")
DotPlot(adult.corrected, features = sample_features) + RotatedAxis() + ggtitle("Sample Dot Plot")
FeaturePlot(larva.corrected, reduction = "umap", label = TRUE, features = "eomesa")
FeaturePlot(adult.corrected, reduction = "umap", label = TRUE, features = "eomesa")
Violin. First check the batch correction (defined by using the group.by):
VlnPlot(larva.corrected,features = "nCount_RNA", group.by = c("orig.ident"))
FeaturePlot(adult.corrected, reduction = "umap", features = c("eomesa", "tbr1b", "mafaa", "neurod2"))
And now the unique cell types:
DimPlot(object = (larva.corrected), label=TRUE)
DimPlot(object = (adult.corrected), label=TRUE)
larva.markers <- FindAllMarkers(object = larva.corrected, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
DEGenes_vector <- vector()
for (i in levels(Idents(larva.corrected))) {
DEGenes_vector = union(DEGenes_vector, head(subset(larva.markers,
cluster == i))[1:3, ]$gene)
}
DEGenes_vector = tolower(DEGenes_vector[2:length(DEGenes_vector)])
# Plot DE genes
DotPlot(larva.corrected, features = DEGenes_vector) + RotatedAxis()
adult.markers <- FindAllMarkers(object = adult.corrected, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
DEGenes_vector <- vector()
for (i in levels(Idents(adult.corrected))) {
DEGenes_vector = union(DEGenes_vector, head(subset(adult.markers,
cluster == i))[1:3, ]$gene)
}
DEGenes_vector = tolower(DEGenes_vector[2:length(DEGenes_vector)])
# Plot DE genes
DotPlot(adult.corrected, features = DEGenes_vector) + RotatedAxis()
#' Plots the relative cluster size of each cluster in an object
#'
#' @param A Seurat object
PlotClusterSize = function(object){
num_cells <- dim(object@meta.data)[1]
cluster_percent <- vector()
for(i in levels(object@meta.data$clusterID)){
clus_cells <- length(which(object@meta.data$clusterID == i))
cluster_percent[i] <- clus_cells / num_cells * 100
}
barplot(cluster_percent, xlab = "Cluster ID", ylab = "Percentage of Cells [%]")
}
# Plot relative size of each cluster
PlotClusterSize(adult.corrected)
Find markers for every cluster compared to all remaining cells. From seurat webpage: “The min.pct argument requires a feature to be detected at a minimum percentage in either of the two groups of cells, and the thresh.test argument requires a feature to be differentially expressed (on average) by some amount between the two groups. You can set both of these to 0, but with a dramatic increase in time - since this will test a large number of features that are unlikely to be highly discriminatory. As another option to speed up these computations, max.cells.per.ident can be set. This will downsample each identity class to have no more cells than whatever this is set to. While there is generally going to be a loss in power, the speed increases can be significiant and the most highly differentially expressed features will likely still rise to the top”.
To identify the which kind of cells are in each cluster I define the setting to report only the positive markers, the marker has to be expressed in at least 25% of the cells in the cluster and to have a log fold expression of at least 0.8.
Lets look on the top markers:
adult.markers %>% group_by(cluster) %>% top_n(n = 10,wt = avg_logFC)
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat
## # A tibble: 330 x 7
## # Groups: cluster [33]
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0 1.30 0.718 0.234 0 0 zfp36
## 2 0 1.21 0.992 0.584 0 0 adcyap1b
## 3 0 0.934 0.926 0.521 0 0 rgs8
## 4 0 0.919 0.455 0.07 0 0 si:ch211-270g19.5
## 5 0 0.897 0.9 0.547 0 0 atp1b3a
## 6 0 0.875 0.77 0.34 0 0 nptx1l
## 7 0 0.868 0.566 0.126 0 0 mef2cb
## 8 0 0.866 0.413 0.057 0 0 grm6b
## 9 0 0.811 0.772 0.227 0 0 pcdh7b
## 10 0 0.794 0.580 0.154 0 0 gria3a
## # … with 320 more rows
plot the expression of a chosen genes on the UMAP. elavl3 are differentiated neurons, while the others are progenitors markers:
FeaturePlot(object = adult.corrected, features = c("elavl3", "pcvna", "fabp7a", "her4.1"))
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: pcvna
We can use this data to subset the differentiated neurons and recluster them.
#if only some clusters are neurons
#larva.neurons<- subset(x = larva, idents=c("0", "1", "2", "3", "4","5", "10", "11", "14", "18", "22", "23"))
adult.neurons <- adult.corrected
how many neurons do we have:
sum(table(...=adult.neurons@active.ident))
## [1] 30483
Lets look on the neurotransmiters expression:
FeaturePlot(object = larva.corrected,features = c("gad2", "gad1b", "slc6a1b", "chata", "slc17a6a", "slc17a6b", "slc17a7a", "slc6a5", "slc6a9"))
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: chata, slc17a6a, slc17a7a, slc6a5
We can use this data to recluster only the inhibitory or the excitatory neurons etc.